In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab as py 
import seaborn as sns
import folium
from folium.plugins import TimeSliderChoropleth
from branca.element import Figure
from scipy import stats
import statsmodels.api as sm 
import geopandas as gpd
sns.set()

MAST 30034 Assignment 1 Source Code

In [166]:
import urllib
import os
# go to tlc web
# get the url link address
months = ["12", "01", "02"]
years = [2017, 2018]
for year in years:
    for month in months:
        fname=os.getcwd()+f'/yellow_tripdata_{year}-{month}.csv'
        url = f'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_{year}-{month}.csv'
        urllib.request.urlretrieve(url, fname)
In [1]:
import urllib
import os
#month_year = [[2016, "12"], [2019, "01"], [2019, "02"]]
month_year = [[2016, "12"]]
for date in month_year:
    fname=os.getcwd()+f'/yellow_tripdata_{date[0]}-{date[1]}.csv'
    url = f'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_{date[0]}-{date[1]}.csv'
    urllib.request.urlretrieve(url, fname)
In [4]:
taxi = pd.read_csv("yellow_tripdata_2016-12.csv")
taxi = taxi.reset_index()
taxi.drop(labels=[1, 2], axis=1, inplace = True)
taxi.columns = list(taxi.columns[2:])+[1, 2]
taxi.to_csv("yellow_tripdata_2016-12.csv", index=False)
In [223]:
# getting the snowfall data
snow_data = pd.read_csv('snow.csv')

Snow Preprocessing

In [217]:
snow_data.head()
Out[217]:
Unnamed: 0 Date MaxTemp MinTemp Average New Snow Snow Depth
0 0 2017-12-01 51 38 44.5 0.0 0.0
1 1 2017-12-02 45 32 38.5 0.0 0.0
2 2 2017-12-03 47 36 41.5 0.0 0.0
3 3 2017-12-04 48 33 40.5 0.0 0.0
4 4 2017-12-05 59 48 53.5 0.0 0.0
In [21]:
snow_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        270 non-null    datetime64[ns]
 1   MaxTemp     270 non-null    int64         
 2   MinTemp     270 non-null    int64         
 3   Average     270 non-null    float64       
 4   New Snow    270 non-null    object        
 5   Snow Depth  270 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 12.8+ KB
In [47]:
# only for flag
snow_data.loc[snow_data['New Snow']=='T', 'New Snow'] = -100
snow_data.loc[snow_data['New Snow']=='M', 'New Snow'] = -100
snow_data.loc[snow_data['Snow Depth']=='T', 'Snow Depth'] = -100
snow_data.loc[snow_data['Snow Depth']=='M', 'Snow Depth'] = -100
In [48]:
lst = ['New Snow', 'Snow Depth']
for each_snow in lst:
    snow_data[each_snow] = snow_data[each_snow].apply(lambda x: float(x))
In [49]:
# Since this is a weather data, I made a decision to take an average of the past 2 days 
for each_snow in lst:
    index_replaced = snow_data[snow_data[each_snow]==-100].index
    for each_index in index_replaced:
        snow_data.loc[each_index, each_snow] = np.mean(snow_data.loc[each_index-2:each_index-1, each_snow])
In [54]:
for each_snow in lst:
    snow_data[each_snow] = round(snow_data[each_snow], 1)
In [224]:
lst_temp = ['MaxTemp', 'MinTemp', 'Average']
for temp in lst_temp:
    snow_data[temp] = snow_data[temp].apply(lambda x: int((x-32)*5/9))
In [226]:
snow_data.to_csv('snow.csv')

Done for snow data

Taxi Data Preprocessing

First layer of preprocessing

In [4]:
taxi = pd.read_csv('yellow_tripdata_2018-12.csv')
In [219]:
newark = taxi[taxi['RatecodeID']==3]
In [226]:
newark[newark['fare_amount']<17.5]
Out[226]:
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount
423876 1 2018-12-02 12:01:03 2018-12-02 12:28:31 4 17.00 3 N 50 1 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0
455915 2 2018-12-02 14:53:13 2018-12-02 15:22:35 1 19.57 3 N 158 1 2 -70.0 0.0 0.0 0.0 -20.0 -0.3 -90.3
470347 2 2018-12-02 15:36:10 2018-12-02 15:39:17 1 0.17 3 N 186 186 2 -21.5 0.0 0.0 0.0 0.0 -0.3 -21.8
481842 2 2018-12-02 16:10:26 2018-12-02 16:12:48 1 0.00 3 N 230 230 4 -21.0 0.0 0.0 0.0 0.0 -0.3 -21.3
539919 2 2018-12-02 20:01:59 2018-12-02 20:05:27 2 0.51 3 N 151 238 3 -22.0 -0.5 0.0 0.0 0.0 -0.3 -22.8
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7829386 2 2018-12-30 11:39:31 2018-12-30 11:39:43 2 0.00 3 N 230 230 4 -20.0 0.0 0.0 0.0 0.0 -0.3 -20.3
7878330 2 2018-12-30 15:32:05 2018-12-30 15:32:10 1 0.00 3 N 158 158 3 -20.0 0.0 0.0 0.0 0.0 -0.3 -20.3
7985601 2 2018-12-31 07:44:55 2018-12-31 07:50:49 3 0.98 3 N 164 137 2 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8057347 2 2018-12-31 15:13:19 2018-12-31 15:13:30 3 0.01 3 N 161 161 2 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8091530 2 2018-12-31 17:09:37 2018-12-31 17:09:44 1 0.00 3 N 264 262 3 -20.0 -1.0 0.0 0.0 0.0 -0.3 -21.3

80 rows × 17 columns

In [160]:
type(taxi['tpep_pickup_datetime'][0])
Out[160]:
str
In [161]:
taxi['tpep_dropoff_datetime'] = pd.to_datetime(taxi['tpep_dropoff_datetime'])
taxi['tpep_pickup_datetime'] = pd.to_datetime(taxi['tpep_pickup_datetime'])
taxi['time_taken'] = taxi['tpep_dropoff_datetime']-taxi['tpep_pickup_datetime']
In [162]:
taxi['time_taken'] = taxi['time_taken'].apply(lambda x: x.seconds/3600)
taxi['time_taken'] = taxi['time_taken'].astype('float')
In [163]:
#taxi['time_taken'] = taxi['time_taken'].astype('float')
taxi['speed'] = taxi['trip_distance']/taxi['time_taken']
In [165]:
taxi['speed']
Out[165]:
0           9.523810
1           7.212544
2           0.000000
3          29.068323
4          14.404501
             ...    
8173226     7.107861
8173227    14.125915
8173228     9.712230
8173229    15.333333
8173230     0.000000
Name: speed, Length: 8173231, dtype: float64
In [119]:
# example of dirty date
taxi['Date'] = taxi['tpep_pickup_datetime'].apply(lambda x: x.split()[0])
taxi['Date'].unique()
Out[119]:
array(['2017-12-01', '2017-11-30', '2009-01-01', '2008-12-31',
       '2017-12-02', '2017-12-03', '2017-12-04', '2017-12-05',
       '2017-12-06', '2017-12-07', '2017-12-08', '2017-12-09',
       '2017-12-10', '2017-12-11', '2017-12-12', '2017-12-13',
       '2017-12-14', '2017-12-15', '2017-12-16', '2017-12-17',
       '2017-12-18', '2018-01-20', '2003-01-14', '2017-12-19',
       '2017-12-20', '2017-12-21', '2017-12-22', '2017-12-23',
       '2017-12-24', '2017-12-25', '2017-12-26', '2017-12-27',
       '2017-12-28', '2018-01-17', '2018-02-07', '2018-03-01',
       '2017-12-29', '2003-01-01', '2017-12-30', '2017-12-31',
       '2018-01-01'], dtype=object)
In [85]:
taxi = taxi[taxi['speed']<55]
taxi = taxi[taxi['time_taken']>0]
taxi['trip_count'] = 1
taxi_201701 = taxi.groupby(by='Date').sum().reset_index()
In [105]:
taxi_201701_mean = taxi.groupby(by='Date').mean().reset_index()
In [64]:
taxi.isnull().sum()
Out[64]:
VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
dtype: int64
In [65]:
taxi.head()
Out[65]:
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount
0 1 2017-01-09 11:13:28 2017-01-09 11:25:45 1 3.30 1 N 263 161 1 12.5 0.0 0.5 2.00 0.0 0.3 15.30
1 1 2017-01-09 11:32:27 2017-01-09 11:36:01 1 0.90 1 N 186 234 1 5.0 0.0 0.5 1.45 0.0 0.3 7.25
2 1 2017-01-09 11:38:20 2017-01-09 11:42:05 1 1.10 1 N 164 161 1 5.5 0.0 0.5 1.00 0.0 0.3 7.30
3 1 2017-01-09 11:52:13 2017-01-09 11:57:36 1 1.10 1 N 236 75 1 6.0 0.0 0.5 1.70 0.0 0.3 8.50
4 2 2017-01-01 00:00:00 2017-01-01 00:00:00 1 0.02 2 N 249 234 2 52.0 0.0 0.5 0.00 0.0 0.3 52.80
In [67]:
taxi[taxi['passenger_count']==0].shape
Out[67]:
(532, 17)
In [68]:
taxi['RatecodeID'].unique()
Out[68]:
array([ 1,  2,  5,  3,  4,  6, 99])
In [69]:
taxi['Date'] = taxi['tpep_pickup_datetime'].apply(lambda x: x.split()[0])
In [72]:
taxi['time_taken'] = taxi['time_taken'].apply(lambda x: x.seconds/3600) 
In [77]:
taxi['payment_type']
Out[77]:
0          1
1          1
2          1
3          1
4          2
          ..
9710119    1
9710120    1
9710121    1
9710122    1
9710123    1
Name: payment_type, Length: 9710124, dtype: int64
In [18]:
def preprocessing(month, year, filename):
    
    data = pd.read_csv(filename)
    data = data[data['passenger_count']>0]
    # take out the payment by credit for tip amount
    data = data.loc[data['payment_type'].isin([1, 2]), :]
    data = data.loc[data['RatecodeID'].isin([1, 2, 3, 4, 5, 6]), :]
    # get only trip distance that is positive and also fare amount
    data = data[data['trip_distance']>0]
    # nyc minimum fare is 2.5
    data = data[data['fare_amount']>2.5]
    
    
    if month=='01' or month=='12':
        date = [i for i in range(1, 32)]
    else:
        date = [i for i in range(1, 29)]
        
    # Extract the date
    data['Date'] = data['tpep_pickup_datetime'].apply(lambda x: x.split()[0])
    # clean the date
    lst_date = data['Date'].unique()
    lst_date = [i for i in lst_date if i[5:7] == month and i[0:4] == year and int(i[8:]) in date]
    data = data[data['Date'].isin(lst_date)]
    
    data = data.loc[:, ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 
                        'trip_distance', 'RatecodeID', 'PULocationID', 
                        'DOLocationID','tip_amount', 'total_amount','fare_amount','extra', 'tolls_amount','Date', 
                        'payment_type']]
    
    # check the speed and time
    data['tpep_dropoff_datetime'] = pd.to_datetime(data['tpep_dropoff_datetime'])
    data['tpep_pickup_datetime'] = pd.to_datetime(data['tpep_pickup_datetime'])
    data['time_taken'] = data['tpep_dropoff_datetime']-data['tpep_pickup_datetime']
    data['time_taken'] = data['time_taken'].apply(lambda x: x.seconds/3600) 
    data['time_taken'] = data['time_taken'].astype('float')
    data['speed'] = data['trip_distance']/data['time_taken']
    
    # check the speed limit
    #data = data[data['speed']<=55]
    data = data[data['time_taken']>0]
    data = data[data['speed']>0]
    data.reset_index().to_feather(filename[:-3]+'feather')
    del data
In [19]:
# first layer of preprocessing
#dates = [['01', '2017'],['02', '2017'], ['12','2017'], ['01', '2018'], ['02', '2018'], ['12', '2018'], ['01', '2019'], ['02', '2019']]
dates = [['12', '2016']]
for each_dates in dates:
    print('Month:', each_dates[0])
    print('Year:', each_dates[1])
    preprocessing(each_dates[0], each_dates[1], f'yellow_tripdata_{each_dates[1]}-{each_dates[0]}.csv')
Month: 12
Year: 2016

Second layer of preprocessing

In [6]:
zone = pd.read_csv('taxi+_zone_lookup.csv')
In [48]:
taxi = pd.read_feather('yellow_tripdata_2018-12.feather')
In [49]:
taxi.columns
Out[49]:
Index(['index', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'trip_distance', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'tip_amount', 'total_amount', 'fare_amount', 'extra', 'tolls_amount',
       'Date', 'payment_type', 'time_taken', 'speed'],
      dtype='object')
In [211]:
zone[zone['LocationID']==52]
Out[211]:
LocationID Borough Zone service_zone
51 52 Brooklyn Cobble Hill Boro Zone
In [215]:
jfk_check = taxi[taxi['RatecodeID']==2]
In [216]:
jfk_check[jfk_check['fare_amount']!=52]
Out[216]:
index tpep_pickup_datetime tpep_dropoff_datetime trip_distance RatecodeID PULocationID DOLocationID tip_amount total_amount fare_amount extra tolls_amount Date payment_type time_taken speed
419380 431654 2018-12-02 12:42:24 2018-12-02 13:25:35 18.00 2 132 230 0.0 66.56 66.56 0.0 0.0 2018-12-02 1 0.719722 25.009649
814410 837733 2018-12-03 23:33:19 2018-12-04 00:15:08 24.86 2 148 265 18.0 130.30 101.00 0.0 10.5 2018-12-03 1 0.696944 35.669988
1507516 1549786 2018-12-06 12:23:43 2018-12-06 12:46:21 2.98 2 237 107 0.0 16.80 16.00 0.0 0.0 2018-12-06 2 0.377222 7.899853
4852277 4992290 2018-12-18 05:28:57 2018-12-18 06:01:21 18.70 2 263 132 0.0 64.80 64.00 0.0 0.0 2018-12-18 1 0.540000 34.629630

Attributes that is prone to logic error in its data: speed, time taken, trip distance, fare amount, tolls and extra

In [8]:
inv_data = taxi[['speed', 'time_taken', 'trip_distance', 'fare_amount', 'tolls_amount', 'extra']]
inv_data.describe()
Out[8]:
speed time_taken trip_distance fare_amount tolls_amount extra
count 7.939638e+06 7.939638e+06 7.939638e+06 7.939638e+06 7.939638e+06 7.939638e+06
mean 1.144871e+01 3.159528e-01 2.917850e+00 1.307901e+01 3.423173e-01 3.210860e-01
std 6.638576e+01 1.271257e+00 3.764626e+00 1.116837e+01 1.629153e+00 4.577783e-01
min 4.188823e-04 2.777778e-04 1.000000e-02 3.000000e+00 0.000000e+00 -5.000000e-01
25% 6.857143e+00 1.125000e-01 9.600000e-01 6.500000e+00 0.000000e+00 0.000000e+00
50% 9.591474e+00 1.927778e-01 1.600000e+00 9.500000e+00 0.000000e+00 0.000000e+00
75% 1.333333e+01 3.222222e-01 3.000000e+00 1.500000e+01 0.000000e+00 5.000000e-01
max 6.372000e+04 2.399917e+01 6.023000e+02 5.635000e+02 9.000100e+02 2.020000e+01
In [264]:
inv_data = inv_data[inv_data['fare_amount']>0]
In [269]:
min(inv_data['speed'])
Out[269]:
0.00041888228244301454
In [270]:
inv_data['ratio_fare_speed'] = inv_data['fare_amount']/inv_data['speed']
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [277]:
inv_data['ratio_fare_speed'].describe()
Out[277]:
count    7.547595e+06
mean     1.311254e+00
std      8.490189e-01
min      2.888128e-01
25%      6.852599e-01
50%      1.060764e+00
75%      1.695875e+00
max      4.566475e+00
Name: ratio_fare_speed, dtype: float64
In [278]:
inv_data[inv_data['ratio_fare_speed']==max(inv_data['ratio_fare_speed'])]
Out[278]:
speed time_taken trip_distance fare_amount tolls_amount extra ratio_fare_speed
1223125 4.81772 0.601944 2.9 22.0 0.0 0.0 4.566475
1540374 4.81772 0.601944 2.9 22.0 0.0 0.0 4.566475
1561125 4.81772 0.601944 2.9 22.0 0.0 1.0 4.566475

As you can see, this ratio can capture those data that does not make any sense, as 600 miles can be taken with only 0.04 seconds.

In [276]:
inv_data = inv_data[inv_data['ratio_fare_speed']<inv_data['ratio_fare_speed'].quantile(q=0.975)]
inv_data = inv_data[inv_data['ratio_fare_speed']>inv_data['ratio_fare_speed'].quantile(q=0.025)]
In [255]:
inv_data.isnull().sum()
Out[255]:
speed               0
time_taken          0
trip_distance       0
fare_amount         0
tolls_amount        0
extra               0
ratio_fare_speed    0
dtype: int64
In [ ]:
plt.figure(figsize=(12, 8))
plt.plot(inv_data['speed'], inv_data['fare_amount'])
plt.show()

Clearly, it can be seen that the data contain some errors, for example, the max speed is 63272 miles per hour, which does not make any sense.

In [283]:
taxi[taxi['tolls_amount']==max(taxi['tolls_amount'])]
Out[283]:
index tpep_pickup_datetime tpep_dropoff_datetime trip_distance RatecodeID PULocationID DOLocationID tip_amount total_amount fare_amount extra tolls_amount Date payment_type time_taken speed
3052678 3140122 2018-12-11 21:41:16 2018-12-11 22:20:17 14.7 1 90 11 0.0 946.31 45.0 0.5 900.01 2018-12-11 2 0.650278 22.605724
In [284]:
zone[zone['LocationID'].isin([90, 11])]
Out[284]:
LocationID Borough Zone service_zone
10 11 Brooklyn Bath Beach Boro Zone
89 90 Manhattan Flatiron Yellow Zone
In [13]:
inv_data[inv_data['tolls_amount']>100]
Out[13]:
speed time_taken trip_distance fare_amount tolls_amount extra
277427 96.585366 0.022778 2.20 45.0 810.50 0.0
1089530 13.987730 0.543333 7.60 27.0 508.08 0.5
1116604 25.394191 0.401667 10.20 30.5 508.00 0.5
2312598 11.612903 0.180833 2.10 10.5 185.00 0.5
3052678 22.605724 0.650278 14.70 45.0 900.01 0.5
5577596 17.472067 0.795556 13.90 45.0 812.50 0.0
5581858 17.028668 1.317778 22.44 88.5 119.38 0.0
6278524 42.458998 0.504722 21.43 58.0 200.00 0.0
In [14]:
inv_data[inv_data['extra']<0]
Out[14]:
speed time_taken trip_distance fare_amount tolls_amount extra
3179843 3.106126 0.321944 1.0 12.5 0.0 -0.50
4389350 8.388350 0.143056 1.2 7.5 0.0 -0.50
4425029 23.369803 0.380833 8.9 26.0 0.0 -0.45
7920224 11.646051 0.291944 3.4 15.0 0.0 -0.40
In [20]:
def second_preprocessing(month, year, filename):
    # read the data
    data = pd.read_feather(filename)
    data = data.drop(labels=['index'], axis=1)
    # fare amount needs to be bigger than 2.5
    data = data.loc[data['fare_amount']>0]
    # check the normality of the fare vs speed (taking only 95% quantile)
    data['ratio_fare_speed'] = data['fare_amount']/data['speed']
    data = data[data['ratio_fare_speed']<data['ratio_fare_speed'].quantile(q=0.975)]
    data = data[data['ratio_fare_speed']>data['ratio_fare_speed'].quantile(q=0.025)]
    del data['ratio_fare_speed']
    # normal speed (max=65)
    data = data.loc[data['speed']<65]
    # jkf rate
    data = data.loc[jkf_rate(data), :]
    # newark rate
    data = data.loc[newark_rate(data), :]
    # huge tolls amount will be removed
    data = data.loc[data['tolls_amount']<100, :]
    # remove extra that is less than 0
    data = data[data['extra']>=0]
    
    data.reset_index().to_feather(filename)
    del data
In [21]:
def jkf_rate(data):
    all_index = data.index
    sliced_data = data[data['RatecodeID']==2]
    index_sliced = sliced_data[sliced_data['fare_amount']!=52].index
    set_index = set(all_index)-set(index_sliced)
    return list(set_index)
In [22]:
def newark_rate(data):
    all_index = data.index
    sliced_data = data[data['RatecodeID']==3]
    index_sliced = sliced_data[sliced_data['fare_amount']<17.5].index
    set_index = set(all_index)-set(index_sliced)
    return list(set_index)
In [23]:
# second layer of preprocessing
#dates = [['01', '2017'],['02', '2017'], ['12','2017'], ['01', '2018'], ['02', '2018'], ['12', '2018'], ['01', '2019'], ['02', '2019']]
dates = [['12', '2016']]
for each_dates in dates:
    print('Month:', each_dates[0])
    print('Year:', each_dates[1])
    second_preprocessing(each_dates[0], each_dates[1], f'yellow_tripdata_{each_dates[1]}-{each_dates[0]}.feather')
       
Month: 12
Year: 2016

Third Layer Preprocessing

In [39]:
def whisk_outliers(att, df):
    each_customer_transaction = df[att]
    q1 = each_customer_transaction.quantile(q=0.25)
    q3 = each_customer_transaction.quantile(q=0.75)
    iqr = q3 - q1
    left_bound_min = q1 - 1.5*iqr
    right_bound_max = q3 + 1.5*iqr
    return [left_bound_min, right_bound_max]
In [124]:
taxi = pd.read_feather('yellow_tripdata_2018-12.feather')
In [127]:
taxi.drop(labels=['index', 'total_amount', 'payment_type'], axis=1, inplace = True)
taxi.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7546585 entries, 0 to 7546584
Data columns (total 13 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   tpep_pickup_datetime   datetime64[ns]
 1   tpep_dropoff_datetime  datetime64[ns]
 2   trip_distance          float64       
 3   RatecodeID             int64         
 4   PULocationID           int64         
 5   DOLocationID           int64         
 6   tip_amount             float64       
 7   fare_amount            float64       
 8   extra                  float64       
 9   tolls_amount           float64       
 10  Date                   object        
 11  time_taken             float64       
 12  speed                  float64       
dtypes: datetime64[ns](2), float64(7), int64(3), object(1)
memory usage: 748.5+ MB
In [129]:
del taxi
In [93]:
taxi[taxi['fare_dist_ratio'] > 200]
Out[93]:
index tpep_pickup_datetime tpep_dropoff_datetime trip_distance RatecodeID PULocationID DOLocationID tip_amount total_amount fare_amount extra tolls_amount Date payment_type time_taken speed fare_dist_ratio
12653 13120 2018-12-01 00:20:38 2018-12-01 00:20:45 0.03 5 170 162 6.50 72.30 65.0 0.0 0.0 2018-12-01 1 0.001944 15.428571 2166.666667
22928 23776 2018-12-01 01:33:23 2018-12-01 01:33:26 0.01 2 237 237 0.00 52.80 52.0 0.0 0.0 2018-12-01 2 0.000833 12.000000 5200.000000
26534 27507 2018-12-01 02:47:33 2018-12-01 02:47:38 0.02 5 170 170 0.00 39.80 39.0 0.0 0.0 2018-12-01 1 0.001389 14.400000 1950.000000
26535 27508 2018-12-01 02:50:50 2018-12-01 02:50:55 0.03 5 162 162 0.00 39.80 39.0 0.0 0.0 2018-12-01 1 0.001389 21.600000 1300.000000
27128 28131 2018-12-01 02:28:55 2018-12-01 02:29:00 0.01 5 264 232 3.02 18.12 14.3 0.0 0.0 2018-12-01 1 0.001389 7.200000 1430.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7526743 7919032 2018-12-31 21:55:29 2018-12-31 21:55:33 0.03 5 25 25 11.45 57.25 45.0 0.0 0.0 2018-12-31 1 0.001111 27.000000 1500.000000
7527023 7919319 2018-12-31 21:10:11 2018-12-31 21:10:21 0.02 5 246 246 3.76 22.56 18.0 0.0 0.0 2018-12-31 1 0.002778 7.200000 900.000000
7536399 7929013 2018-12-31 22:19:54 2018-12-31 22:20:09 0.10 5 236 236 4.70 65.00 60.0 0.0 0.0 2018-12-31 1 0.004167 24.000000 600.000000
7540943 7933746 2018-12-31 23:40:01 2018-12-31 23:40:06 0.09 5 170 162 10.00 75.30 65.0 0.0 0.0 2018-12-31 1 0.001389 64.800000 722.222222
7545238 7938239 2018-12-31 23:10:39 2018-12-31 23:11:03 0.30 5 164 186 1.00 136.80 135.0 0.0 0.0 2018-12-31 1 0.006667 45.000000 450.000000

2030 rows × 17 columns

In [97]:
taxi[taxi['fare_dist_ratio']>200]
Out[97]:
index tpep_pickup_datetime tpep_dropoff_datetime trip_distance RatecodeID PULocationID DOLocationID tip_amount total_amount fare_amount extra tolls_amount Date payment_type time_taken speed fare_dist_ratio
12653 13120 2018-12-01 00:20:38 2018-12-01 00:20:45 0.03 5 170 162 6.50 72.30 65.0 0.0 0.0 2018-12-01 1 0.001944 15.428571 2166.666667
22928 23776 2018-12-01 01:33:23 2018-12-01 01:33:26 0.01 2 237 237 0.00 52.80 52.0 0.0 0.0 2018-12-01 2 0.000833 12.000000 5200.000000
26534 27507 2018-12-01 02:47:33 2018-12-01 02:47:38 0.02 5 170 170 0.00 39.80 39.0 0.0 0.0 2018-12-01 1 0.001389 14.400000 1950.000000
26535 27508 2018-12-01 02:50:50 2018-12-01 02:50:55 0.03 5 162 162 0.00 39.80 39.0 0.0 0.0 2018-12-01 1 0.001389 21.600000 1300.000000
27128 28131 2018-12-01 02:28:55 2018-12-01 02:29:00 0.01 5 264 232 3.02 18.12 14.3 0.0 0.0 2018-12-01 1 0.001389 7.200000 1430.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7526743 7919032 2018-12-31 21:55:29 2018-12-31 21:55:33 0.03 5 25 25 11.45 57.25 45.0 0.0 0.0 2018-12-31 1 0.001111 27.000000 1500.000000
7527023 7919319 2018-12-31 21:10:11 2018-12-31 21:10:21 0.02 5 246 246 3.76 22.56 18.0 0.0 0.0 2018-12-31 1 0.002778 7.200000 900.000000
7536399 7929013 2018-12-31 22:19:54 2018-12-31 22:20:09 0.10 5 236 236 4.70 65.00 60.0 0.0 0.0 2018-12-31 1 0.004167 24.000000 600.000000
7540943 7933746 2018-12-31 23:40:01 2018-12-31 23:40:06 0.09 5 170 162 10.00 75.30 65.0 0.0 0.0 2018-12-31 1 0.001389 64.800000 722.222222
7545238 7938239 2018-12-31 23:10:39 2018-12-31 23:11:03 0.30 5 164 186 1.00 136.80 135.0 0.0 0.0 2018-12-31 1 0.006667 45.000000 450.000000

2030 rows × 17 columns

index 22928 may indicate a cancelled trip from airport, but it is still recorded

In [114]:
#taxi['tpep_pickup_datetime'][0].time().hour
taxi['tpep_pickup_datetime'].dt.day_name()
Out[114]:
0          Saturday
1          Saturday
2          Saturday
3          Saturday
4          Saturday
             ...   
7546580      Monday
7546581      Monday
7546582      Monday
7546583      Monday
7546584      Monday
Name: tpep_pickup_datetime, Length: 7546585, dtype: object
In [115]:
taxi.columns
Out[115]:
Index(['index', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'trip_distance', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'tip_amount', 'total_amount', 'fare_amount', 'extra', 'tolls_amount',
       'Date', 'payment_type', 'time_taken', 'speed', 'fare_dist_ratio'],
      dtype='object')
In [24]:
def third_preprocessing(month, year, filename):
    
    # read the data first
    data = pd.read_feather(filename)
    data.drop(labels=['index', 'payment_type', 'total_amount'], inplace = True, axis=1)
    # Only take normal fare distance relation
    data['fare_dist_ratio'] = data['fare_amount']/data['trip_distance']
    data = data.loc[data['fare_dist_ratio']<200, :] # by inspection
    
    data['hour'] = data['tpep_pickup_datetime'].apply(lambda x: x.time().hour)
    data['day'] = data['tpep_pickup_datetime'].dt.day_name()
    
    data.drop(labels=['fare_dist_ratio'], axis=1, inplace = True)
    data.reset_index().to_feather(filename)
    del data
    
In [138]:
taxi = third_preprocessing('01', '2017', 'yellow_tripdata_2017-01.feather')
In [141]:
taxi.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9112595 entries, 0 to 9113566
Data columns (total 15 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   tpep_pickup_datetime   datetime64[ns]
 1   tpep_dropoff_datetime  datetime64[ns]
 2   trip_distance          float64       
 3   RatecodeID             int64         
 4   PULocationID           int64         
 5   DOLocationID           int64         
 6   tip_amount             float64       
 7   fare_amount            float64       
 8   extra                  float64       
 9   tolls_amount           float64       
 10  Date                   object        
 11  time_taken             float64       
 12  speed                  float64       
 13  hour                   int64         
 14  day                    object        
dtypes: datetime64[ns](2), float64(7), int64(4), object(2)
memory usage: 1.1+ GB
In [26]:
# second layer of preprocessing
dates = [['12', '2016'],['01', '2017'],['02', '2017'], ['12','2017'], ['01', '2018'], ['02', '2018'], ['12', '2018'], ['01', '2019'], ['02', '2019']]
for each_dates in dates:
    print('Month:', each_dates[0])
    print('Year:', each_dates[1])
    third_preprocessing(each_dates[0], each_dates[1], f'yellow_tripdata_{each_dates[1]}-{each_dates[0]}.feather')
Month: 12
Year: 2016

Fourth Layer Preprocessing

In [42]:
def fourth_preprocessing(month, year):
    """
    This preprocessing is mainly creating multiple dataframe for attribute investigation
    """
    # PU-DOlocation
    data = pd.read_feather(f'yellow_tripdata_{year}-{month}.feather')
    data.drop(labels=['index'], inplace = True, axis=1)
    data['trip_count'] = 1
    
    date_pu_do_freq = data.groupby(by=['PULocationID', 'DOLocationID', 'Date', 'RatecodeID']).sum().reset_index()
    date_pu_do_freq = date_pu_do_freq.loc[:, ['PULocationID', 'DOLocationID', 'Date', 'RatecodeID', 'trip_count', 
                                              'trip_distance']]
    
    date_pu_do_mean = data.groupby(by=['PULocationID', 'DOLocationID', 'Date']).mean().reset_index()
    date_pu_do_mean.drop(labels=['RatecodeID', 'hour', 'trip_count'], axis=1, inplace = True)
    
    date_pu_do_freq.reset_index().to_feather(f'pu-do-date-{month}-{year}-freq.feather')
    date_pu_do_mean.reset_index().to_feather(f'pu-do-date-{month}-{year}-mean.feather')
    
    del date_pu_do_freq
    del date_pu_do_mean
    
    # hour day
    hour_day_pu_do_freq = data.groupby(by=['PULocationID', 'DOLocationID', 'day', 'hour']).sum().reset_index()
    hour_day_pu_do_mean = data.groupby(by=['PULocationID', 'DOLocationID', 'day', 'hour']).mean().reset_index()
    
    hour_day_pu_do_freq = hour_day_pu_do_freq.loc[:, ['PULocationID', 'DOLocationID', 'day', 'hour', 'trip_count']]
    hour_day_pu_do_mean.drop(labels=['trip_count', 'RatecodeID'], axis=1, inplace = True)
    
    hour_day_pu_do_freq.reset_index().to_feather(f'hour-day-{month}-{year}-freq.feather')
    hour_day_pu_do_mean.reset_index().to_feather(f'hour-day-{month}-{year}-mean.feather')
    
    del hour_day_pu_do_freq
    del hour_day_pu_do_mean
    
    # RatecodeID
    
    rate_id_freq = data.groupby(by=['RatecodeID', 'PULocationID', 'day', 'hour']).mean().reset_index()
    rate_id_freq.drop(labels=['trip_count'], axis=1, inplace = True)
    rate_id_freq.reset_index().to_feather(f'rate-id-{month}-{year}-mean.feather')
    
    del rate_id_freq
    del data
    
In [43]:
dates = [['12', '2016'],['01', '2017'],['02', '2017'], ['12','2017'], ['01', '2018'], ['02', '2018'], ['12', '2018'], ['01', '2019'], ['02', '2019']]
for each_dates in dates:
    print('Month:', each_dates[0])
    print('Year:', each_dates[1])
    fourth_preprocessing(each_dates[0], each_dates[1])
Month: 12
Year: 2016
Month: 01
Year: 2017
Month: 02
Year: 2017
Month: 12
Year: 2017
Month: 01
Year: 2018
Month: 02
Year: 2018
Month: 12
Year: 2018
Month: 01
Year: 2019
Month: 02
Year: 2019

Fifth layer preprocessing

In [44]:
file_name = ['pu-do-date-mean.feather', 
             'hour-day-freq.feather', 'hour-day-mean.feather',
             'rate-id-mean.feather']

for each_file in file_name:
    
    dates = [['01', '2017'],['02', '2017'], ['12','2017'], ['01', '2018'], 
             ['02', '2018'], ['12', '2018'], ['01', '2019'], ['02', '2019']]
    
    if each_file[:5]=='pu-do':
        
        metric = each_file[11:15]
            
        data = pd.read_feather(f"pu-do-date-12-2016-{metric}.feather")
        data.drop(labels=['index'], axis=1, inplace = True)

        for each_date in dates:
            open_data = pd.read_feather(f"pu-do-date-{each_date[0]}-{each_date[1]}-{metric}.feather")
            open_data.drop(labels=['index'], axis=1, inplace = True)
            data = pd.concat([data, open_data])

        data.reset_index().to_feather(each_file)
        
    elif each_file[:8]=='hour-day':
        
        metric = each_file[9:13]
        
        data = pd.read_feather(f"hour-day-12-2016-{metric}.feather")
        data.drop(labels=['index'], axis=1, inplace = True)
        
        for each_date in dates:
            open_data = pd.read_feather(f"hour-day-{each_date[0]}-{each_date[1]}-{metric}.feather")
            open_data.drop(labels=['index'], axis=1, inplace = True)
            data = pd.concat([data, open_data])

        data.reset_index().to_feather(each_file)
        
    else:
        
        data = pd.read_feather(f"rate-id-12-2016-mean.feather")
        data.drop(labels=['index'], axis=1, inplace = True)
        
        for each_date in dates:
            open_data = pd.read_feather(f"rate-id-{each_date[0]}-{each_date[1]}-mean.feather")
            open_data.drop(labels=['index'], axis=1, inplace = True)
            data = pd.concat([data, open_data])

        data.reset_index().to_feather(each_file)
        
    del data
    del open_data

    
In [45]:
taxi = pd.read_feather("pu-do-date-mean.feather")

Finish Preprocessing

Analysis and Visualisation

In [198]:
pu_do_freq = pd.read_feather('pu-do-date-freq.feather')
pu_do_mean = pd.read_feather('pu-do-date-mean.feather')
In [199]:
rate_id = pd.read_feather('rate-id-mean.feather')
In [200]:
hour_day_freq = pd.read_feather('hour-day-freq.feather')
hour_day_mean = pd.read_feather('hour-day-mean.feather')
In [201]:
# external dataset
snow = pd.read_csv('snow.csv')
snow = snow[snow.columns[2:]]
# shapefile and taxi zone
sf = gpd.read_file("taxi_file/taxi_zones.shp")
zone = pd.read_csv('taxi_file/taxi+_zone_lookup.csv')
In [202]:
zone_dic = {}
for index, row in zone.iterrows():
  zone_dic[row[0]] = row[1]
In [203]:
# helper function
def whisk_outliers(att, df):
    each_customer_transaction = df[att]
    q1 = each_customer_transaction.quantile(q=0.25)
    q3 = each_customer_transaction.quantile(q=0.75)
    iqr = q3 - q1
    left_bound_min = q1 - 1.5*iqr
    right_bound_max = q3 + 1.5*iqr
    return [left_bound_min, right_bound_max]

# conduct minor test by assuming both has normal distribution with unknown mean and std
# test hypothesis mean difference = 0
# this code was taken from https://pythonfordatascienceorg.wordpress.com/welch-t-test-python-pandas/
# and modified to match the purpose of this assignment

def welch_ttest(x, y, equal_var=False): 
    ## Welch-Satterthwaite Degrees of Freedom ##
    dof = (x.var()/x.size + y.var()/y.size)**2 / ((x.var()/x.size)**2 / (x.size-1) + (y.var()/y.size)**2 / (y.size-1))
   
    t, p = stats.ttest_ind(x, y, equal_var = equal_var)
    
    print("\n",
          f"Welch's t-test= {t:.4f}", "\n",
          f"p-value = {p:.8f}", "\n",
          f"Welch-Satterthwaite Degrees of Freedom= {dof:.4f}")

Creating the heatmap

In [7]:
grouped_date = pu_do_mean.groupby(by='Date').median().reset_index()
grouped_date = pd.merge(grouped_date, snow, left_on='Date', right_on='Date')
In [8]:
grouped_date_freq = pu_do_freq.groupby(by='Date').sum().reset_index()
grouped_date['trip_count'] = grouped_date_freq['trip_count']
In [9]:
plt.figure(figsize=(10, 6))
# this code is taken from mast30034 tutorial
sns.heatmap(grouped_date[['trip_distance', 'trip_count', 'MaxTemp', 'MinTemp',
                     'Average', 'New Snow', 'Snow Depth']].corr())
#plt.savefig('plot/correlation_plot.png')
plt.show()
In [10]:
grouped_date[['trip_distance', 'trip_count', 'MaxTemp', 'MinTemp',
                     'Average', 'New Snow', 'Snow Depth']].corr()
Out[10]:
trip_distance trip_count MaxTemp MinTemp Average New Snow Snow Depth
trip_distance 1.000000 0.148984 0.080249 0.115227 0.098543 -0.192378 -0.229815
trip_count 0.148984 1.000000 0.118810 0.157458 0.145484 -0.059837 0.086644
MaxTemp 0.080249 0.118810 1.000000 0.822046 0.951790 -0.188395 -0.193355
MinTemp 0.115227 0.157458 0.822046 1.000000 0.947133 -0.126446 -0.247353
Average 0.098543 0.145484 0.951790 0.947133 1.000000 -0.170872 -0.236783
New Snow -0.192378 -0.059837 -0.188395 -0.126446 -0.170872 1.000000 0.193014
Snow Depth -0.229815 0.086644 -0.193355 -0.247353 -0.236783 0.193014 1.000000

Trip frequency

In [12]:
pu_do_freq.drop(labels=['index'], axis=1, inplace=True)
grouped_date = pu_do_freq.groupby(by='Date').sum().reset_index()
grouped_date = pd.merge(grouped_date, snow, left_on='Date', right_on='Date')
In [13]:
fig, ax = plt.subplots(figsize=(10, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.plot(grouped_date.index, grouped_date['trip_count'],"b-")
ax.axvline(90, color='g', linestyle='--', label='2017-2018')
ax.axvline(180, color='r', linestyle='--', label='2018-2019')
ticks = [i for i in range(0, 270, 30)]
ticks.append(269)
ax.set_xticks(ticks)
ax.set_xticklabels(list(grouped_date.loc[ticks, 'Date']))
ax.set_title('Time series plot on the trip frequency of New York Yellow Taxi\n during December-February from 2016-2019\n', fontsize=18)
ax.set_ylabel('Trip frequency')
ax.set_xlabel('Date')
ax.scatter(x=[24, 114, 204], y=[153998, 136866, 113671], c='r', label='Christmas')
plt.legend()
plt.annotate('391434 trips\n 2016-12-16\n Min temp: -7 Celcius', xy=(15, 391434), xytext=(50, 375000), 
             arrowprops=dict(facecolor='green', shrink=0.05))
plt.annotate('113671 trips\n 2018-12-25\n Min temp: -1 Celcius', xy=(204, 113671), xytext=(220, 113671), 
             arrowprops=dict(facecolor='green', shrink=0.05))
plt.annotate('Bomb Cyclone\n114150 trips\n2018-01-04\nSnow fall rate: 9 inch', xy=(124, 114150), xytext=(130, 135000), 
             arrowprops=dict(facecolor='green', shrink=0.05))
plt.annotate('NA Blizzard\n192959 trips\n2017-02-09\nSnow fall rate: 10.3 inch', xy=(70, 192959), xytext=(35, 147500), 
             arrowprops=dict(facecolor='green', shrink=0.05))
#plt.savefig('plot/time_series_freq.png')
plt.show()
In [14]:
# Create a figure instance
fig = plt.figure(1, figsize=(9, 6))

# Create an axes instance
ax = fig.add_subplot(111)

# Create the boxplot
bp = ax.boxplot([grouped_date.loc[0:90, 'trip_count'],grouped_date.loc[90:180, 'trip_count'],
             grouped_date.loc[180:, 'trip_count']])
ax.set_xticklabels(['2016-2017', '2017-2018', '2018-2019'])
plt.ylabel('Trip count')
plt.title('Boxplot of trip frequency during different year period in December-February\n', fontsize=18)
# Save the figure
#fig.savefig('plot/boxplot_freq.png', bbox_inches='tight')
plt.show()
In [15]:
for i in range(0, 270, 90):
    print(f'Statistical inference of each period:')
    print(grouped_date.loc[i:i+90, 'trip_count'].describe())
    print('\n')
Statistical inference of each period:
count        91.000000
mean     305970.846154
std       41946.145459
min      153998.000000
25%      286383.500000
50%      307454.000000
75%      332474.000000
max      391434.000000
Name: trip_count, dtype: float64


Statistical inference of each period:
count        91.000000
mean     277147.175824
std       40702.103596
min      114150.000000
25%      251403.500000
50%      285155.000000
75%      304305.500000
max      341281.000000
Name: trip_count, dtype: float64


Statistical inference of each period:
count        90.000000
mean     234296.788889
std       34863.853292
min      113671.000000
25%      212244.750000
50%      240709.000000
75%      258765.000000
max      295516.000000
Name: trip_count, dtype: float64


In [18]:
index_outliers = []
index_period = [[0, 90], [90]]
for i in range(0, 270, 90):
    sliced_df = grouped_date.loc[i:i+90, :]
    lb, ub = whisk_outliers('trip_count', sliced_df)
    outliers_ub = sliced_df[sliced_df['trip_count']>ub]
    if len(list(outliers_ub.index))>0:
        index_outliers.append(list(outliers_ub.index))
    outliers_lb = sliced_df[sliced_df['trip_count']<lb]
    if len(list(outliers_lb.index))>0:
        index_outliers.append(list(outliers_lb.index))
index_outliers = [j for i in index_outliers for j in i]
grouped_date.loc[index_outliers, :]
Out[18]:
Date PULocationID DOLocationID RatecodeID trip_count trip_distance MaxTemp MinTemp Average New Snow Snow Depth
24 2016-12-25 1534733 1468358 11416 153998 525491.20 10 0 5 0.0 0.0
25 2016-12-26 1357002 1301777 10076 190517 637835.20 9 -2 3 0.0 0.0
32 2017-01-02 1347335 1293515 10234 210693 715905.18 3 -1 1 0.0 0.0
70 2017-02-09 1298152 1236160 9477 192959 502278.01 5 -8 -1 10.3 2.0
114 2017-12-25 1330885 1266942 10005 136866 456659.89 3 -1 1 0.4 0.0
124 2018-01-04 1066962 1025381 7893 114150 286304.65 -2 -5 -4 9.0 2.0
204 2018-12-25 1381636 1326213 12320 113671 394603.01 5 -1 2 0.0 0.0
In [19]:
#find the big drop
find_perc = grouped_date.copy()
find_perc['perc_change'] = find_perc['trip_count'].pct_change()
find_perc.dropna(inplace = True)
find_perc[find_perc['perc_change']<0].sort_values(by='perc_change', ascending=True)[:5]
Out[19]:
Date PULocationID DOLocationID RatecodeID trip_count trip_distance MaxTemp MinTemp Average New Snow Snow Depth perc_change
124 2018-01-04 1066962 1025381 7893 114150 286304.65 -2 -5 -4 9.0 2.0 -0.542364
70 2017-02-09 1298152 1236160 9477 192959 502278.01 5 -8 -1 10.3 2.0 -0.372211
24 2016-12-25 1534733 1468358 11416 153998 525491.20 10 0 5 0.0 0.0 -0.352688
204 2018-12-25 1381636 1326213 12320 113671 394603.01 5 -1 2 0.0 0.0 -0.335560
114 2017-12-25 1330885 1266942 10005 136866 456659.89 3 -1 1 0.4 0.0 -0.286759
In [20]:
try_predict = find_perc['trip_count'][1:]
perc_change = find_perc['trip_count'][:-1]
plt.scatter(try_predict, perc_change)
plt.ylabel("Next day frequency")
plt.xlabel("Previous frequency")
plt.title("Scatter plot of previous trip frequency vs next day trip frequency\n", fontsize=14)
plt.show()
In [106]:
np.corrcoef(try_predict, perc_change)
Out[106]:
array([[1.        , 0.81447971],
       [0.81447971, 1.        ]])
In [21]:
#find which day has the most decrease in trip demand
neg_dem = find_perc[find_perc['perc_change']<0]
neg_dem['Date'] = pd.to_datetime(neg_dem['Date'])
neg_dem['day'] = neg_dem['Date'].apply(lambda x: x.day_name())
pos_dem = find_perc[find_perc['perc_change']>0]
pos_dem['Date'] = pd.to_datetime(pos_dem['Date'])
pos_dem['day'] = pos_dem['Date'].apply(lambda x: x.day_name())
/Users/nathanaelyoewono/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/Users/nathanaelyoewono/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
/Users/nathanaelyoewono/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/Users/nathanaelyoewono/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
In [22]:
mode_day_neg = pd.DataFrame(neg_dem['day'].value_counts())
mode_day_neg['day'] = round((mode_day_neg['day']/mode_day_neg['day'].sum()), 2)
mode_day_neg = mode_day_neg.loc[['Monday', 'Tuesday', "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], :]
mode_day_pos = pd.DataFrame(pos_dem['day'].value_counts())
mode_day_pos['day'] = round((mode_day_pos['day']/mode_day_pos['day'].sum()), 2)
mode_day_pos = mode_day_pos.loc[['Monday', 'Tuesday', "Wednesday", "Thursday", "Friday", "Saturday"], :]
In [23]:
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
fig.suptitle('Tally on percentage increase/decrease of trip frequency by day\n', fontsize=18)
ax[0].plot(mode_day_neg, color='r', label='Decrease')
ax[1].plot(mode_day_pos, color='g', label='Increase')

for ax in ax.flat:
    ax.set(ylabel='Percentage (%)')
    ax.legend()
#fig.savefig('plot/tally_percentage_trip_daily.png')
plt.show()
In [24]:
# check the quantil" plot for normality check 
sm.qqplot(grouped_date.loc[0:90, 'trip_count'], line ='45') 
py.show() 
In [26]:
print('Test whether the mean difference of trip frequency\nbetween 2016-2017 and 2018-2019:\n')
print('Normal distribution test for 2016-2017:')
print(stats.shapiro(grouped_date.loc[0:90, 'trip_count']))
print('\nNormal distribution test for 2018-2019:')
print(stats.shapiro(grouped_date.loc[180:, 'trip_count'])) #game day may have big p-value, but still stick to normal for simplicity
#welch_ttest(grouped_date.loc[0:90, 'trip_count'], grouped_date.loc[180:, 'trip_count'])
Test whether the mean difference of trip frequency
between 2016-2017 and 2018-2019:

Normal distribution test for 2016-2017:
(0.9531139135360718, 0.0024465112946927547)

Normal distribution test for 2018-2019:
(0.9689577221870422, 0.029878417029976845)
In [27]:
fig, ax = plt.subplots(figsize=(10, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.plot(grouped_date.index, grouped_date['MinTemp'],"b-")
ax.axvline(90, color='g', linestyle='--', label='2017-2018')
ax.axvline(180, color='r', linestyle='--', label='2018-2019')
ticks = [i for i in range(0, 239, 30)]
ticks.append(238)
ax.set_xticks(ticks)
ax.set_xticklabels(list(grouped_date.loc[ticks, 'Date']))
ax.set_title('Time series plot on the minimum temperature forecast of New York December-Februrary 2016-2019\n', fontsize=18)
ax.set_ylabel('Temperature (Celcius)')
ax.set_xlabel('Date')
plt.legend()
plt.annotate('-18 degree celcius\n 2018-01-07\n Trip: 222426', xy=(127, -18), xytext=(140, -18), 
             arrowprops=dict(facecolor='green', shrink=0.05))
plt.annotate('8 degree celcius\n 2017-12-05\n Trip: 308487', xy=(94, 8), xytext=(110, 5), 
             arrowprops=dict(facecolor='green', shrink=0.05))
#plt.savefig('time_series_mintemp.png')
plt.show()
In [28]:
fig, ax = plt.subplots(figsize=(10, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.plot(grouped_date.index, grouped_date['Snow Depth'],"b-")
ax.axvline(90, color='g', linestyle='--', label='2017-2018')
ax.axvline(180, color='r', linestyle='--', label='2018-2019')
ticks = [i for i in range(0, 239, 30)]
ticks.append(238)
ax.set_xticks(ticks)
ax.set_xticklabels(list(grouped_date.loc[ticks, 'Date']))
ax.set_title('Time series plot on the snow depth forecast of New York December-Februrary 2016-2019\n', fontsize=18)
ax.set_ylabel('Depth (inch)')
ax.set_xlabel('Date')
plt.legend()
plt.annotate('9.0 inch snow\n 2017-02-10\n Trip: 318808', xy=(71, 9.0), xytext=(20, 7), 
             arrowprops=dict(facecolor='green', shrink=0.05))
#plt.annotate('8 degree celcius\n 2017-12-05\n Trip: 308487', xy=(94, 8), xytext=(110, 5), 
             #arrowprops=dict(facecolor='green', shrink=0.05))
#plt.savefig('time_series_snowdepth.png')
plt.show()
In [29]:
plt.figure(figsize=(12, 8))
correlation = np.corrcoef(grouped_date['MinTemp'], grouped_date['trip_count'])[0][-1]
ax = sns.regplot(x='MinTemp', y='trip_count', data=grouped_date, color="b", 
                 label=f'Correlation: {round(correlation, 2)}')
plt.title('Scatter plot of trip frequency vs minimum forecast temperature\n', fontsize=18)
plt.ylabel('Trip frequency')
plt.xlabel('Minimum temperature (Celcius)')
plt.legend()

#plt.savefig('freq_mintemp_scatter_normal.png')
plt.show()
In [124]:
# trip count statistic inference for temperature 0
grouped_date[grouped_date['MinTemp']==0].describe()['trip_count']
Out[124]:
count        42.000000
mean     265128.119048
std       51972.224583
min      153998.000000
25%      233575.500000
50%      270568.000000
75%      300936.750000
max      351992.000000
Name: trip_count, dtype: float64
In [31]:
def label_snow_fall(x):
    if x<2: return 1
    elif x>=2 and x<4: return 2
    elif x>=4 and x<6: return 3
    else: return 4
In [32]:
grouped_date['SnowFallBins'] = grouped_date['New Snow'].apply(lambda x: label_snow_fall(x))
grouped_bins_snow_fall = grouped_date.groupby(by='SnowFallBins').mean().reset_index()
In [33]:
stats_analysis = pd.DataFrame(grouped_date[grouped_date['SnowFallBins']==1].describe()['trip_count'])
stats_analysis.columns = ['0-2 inch']
stats_analysis['2-4 inch'] = grouped_date[grouped_date['SnowFallBins']==2].describe()['trip_count']
stats_analysis['4-6 inch'] = grouped_date[grouped_date['SnowFallBins']==3].describe()['trip_count']
stats_analysis['>=6 inch'] = grouped_date[grouped_date['SnowFallBins']==4].describe()['trip_count']
stats_analysis
Out[33]:
0-2 inch 2-4 inch 4-6 inch >=6 inch
count 258.000000 5.000000 5.000000 2.000000
mean 271922.879845 292554.000000 321409.200000 153554.500000
std 48162.278837 9689.001445 36027.534771 55726.378319
min 113671.000000 283298.000000 285893.000000 114150.000000
25% 242323.500000 285089.000000 291094.000000 133852.250000
50% 274967.000000 291899.000000 318808.000000 153554.500000
75% 305422.750000 294811.000000 337415.000000 173256.750000
max 391434.000000 307673.000000 373836.000000 192959.000000
In [34]:
plt.figure(figsize=(12, 8))
labels=["0-2","2-4", "4-6", ">=6"]
ax = sns.barplot(x='SnowFallBins', y='trip_count', data=grouped_bins_snow_fall)
plt.title('Bar plot of trip frequency grouped by four range of snow fall rate\n', fontsize=18)
plt.ylabel('Trip frequency')
#plt.xtickslabels(labels)
plt.xlabel('inch')
ax.set_xticklabels(labels)
plt.savefig('plot/bar_average_trip_snow_fall.png')
plt.show()

Time Slider

In [204]:
sliced_christmas = pu_do_freq[pu_do_freq['Date'].isin(['2018-12-24','2018-12-25','2018-12-26'])]
sliced_christmas = sliced_christmas.groupby(by=['Date','PULocationID']).sum().reset_index()
sliced_christmas['PULocationID'] = sliced_christmas['PULocationID'].astype('str')
In [205]:
# seperate the data into bins first
bins=np.linspace(min(sliced_christmas['trip_count']),max(sliced_christmas['trip_count']),11)

# Coloring the location id based on certain attribute
sliced_christmas['color']=pd.cut(sliced_christmas['trip_count'],
                              bins,labels=['#FFEBEB','#F8D2D4','#F2B9BE','#EBA1A8','#E58892','#DE6F7C',
                                          '#D85766','#D13E50','#CB253A','#C50D24'],include_lowest=True)
sliced_christmas = sliced_christmas[['Date','PULocationID','color']]

# find the DOLocationID that has no trip count on that day
for date in sliced_christmas['Date'].unique():
    diff=set(zone['LocationID'].astype(str))-set(sliced_christmas[sliced_christmas['Date']==date]['PULocationID'])
    for i in diff:
      grouped_game=pd.concat([sliced_christmas,pd.DataFrame([[date,i,'#0073CF']],columns=['Date','PULocationID','color'])],ignore_index=True)
sliced_christmas.sort_values('Date',inplace=True)

sliced_christmas['Date'] = pd.to_datetime(sliced_christmas['Date'])
sliced_christmas['Date']=(sliced_christmas['Date'].astype(int)// 10**9).astype('U10')

# get the style dict
trip_dict={}
for i in sliced_christmas['PULocationID'].unique():
    trip_dict[i]={}
    for j in sliced_christmas[sliced_christmas['PULocationID']==i].set_index(['PULocationID']).values: 
        trip_dict[i][j[0]]={'color':j[1],'opacity':0.7}
In [206]:
# this code was recycled and edited from MAST30034 tutorial
# sf stands for shape file
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
sf['LocationID'] = sf['LocationID'].astype('str')
geoJSON = sf[['LocationID','geometry']].set_index('LocationID').to_json()

# plot the slider map
fig6=Figure(height=850,width=1000)
m6 = folium.Map([40.66, -73.94], tiles='cartodbpositron', zoom_start=10)
fig6.add_child(m6)

g = TimeSliderChoropleth(
    geoJSON,
    styledict=trip_dict,
).add_to(m6)

folium.Marker(location=[40.778805,-73.9703917],
              tooltip='<strong>Manhattan</strong>',
              icon=folium.Icon(color='red',prefix='fa', icon='star')).add_to(m6)
folium.Marker(location=[40.6413111,-73.7803278],
              tooltip='<strong>JFK NY Airport</strong>',
              icon=folium.Icon(color='purple',prefix='fa',icon='plane')).add_to(m6)
folium.Marker(location=[40.6895314,-74.1766511],
              tooltip='<strong>Newark NJ Airport</strong>',
              icon=folium.Icon(color='purple',prefix='fa',icon='plane')).add_to(m6)
folium.Marker(location=[40.7769271,-73.8761546],
              tooltip='<strong>LaGuardia Airport</strong>',
              icon=folium.Icon(color='purple',prefix='fa',icon='plane')).add_to(m6)
#m6.save('plot/last_christmas_timeslider.html')
m6
Out[206]:
In [115]:
pu_do_freq['PUBorough'] = pu_do_freq['PULocationID'].apply(lambda x: zone_dic[x])
pu_do_freq['DOBorough'] = pu_do_freq['DOLocationID'].apply(lambda x: zone_dic[x])
In [116]:
sliced_christmas = pu_do_freq[pu_do_freq['Date'].isin(['2018-12-24','2018-12-25','2018-12-26'])]
sliced_christmas_date = sliced_christmas.groupby(by=['Date']).sum().reset_index()
In [118]:
# check the median decrease in each day
sliced_christmas
Out[118]:
Date PULocationID DOLocationID RatecodeID trip_count trip_distance
0 2018-12-24 1517754 1450012 13591 171078 473114.77
1 2018-12-25 1381636 1326213 12320 113671 394603.01
2 2018-12-26 1407588 1346884 13143 169799 551367.66
In [120]:
# check the borough
sliced_christmas = pu_do_freq.groupby(by=['PUBorough', 'Date']).sum().reset_index()
sliced_christmas.loc[sliced_christmas['Date'].isin(['2018-12-24','2018-12-25','2018-12-26'])]
Out[120]:
PUBorough Date PULocationID DOLocationID RatecodeID trip_count trip_distance
203 Bronx 2018-12-24 64776 60130 606 493 2932.01
204 Bronx 2018-12-25 60579 57810 677 475 3249.24
205 Bronx 2018-12-26 48220 43873 581 336 2643.98
473 Brooklyn 2018-12-24 123775 152713 1822 1956 9673.70
474 Brooklyn 2018-12-25 109657 134156 1639 1565 8537.44
475 Brooklyn 2018-12-26 114875 139720 1672 1774 8520.94
649 EWR 2018-12-24 1 265 5 1 1.70
650 EWR 2018-12-26 1 1 5 1 0.80
882 Manhattan 2018-12-24 1077996 999948 8661 157146 362088.65
883 Manhattan 2018-12-25 977216 911797 7585 99936 266031.24
884 Manhattan 2018-12-26 1003830 933050 8410 149412 354826.50
1152 Queens 2018-12-24 222935 220335 2354 8446 90436.48
1153 Queens 2018-12-25 212647 208539 2311 9808 110230.25
1154 Queens 2018-12-26 210182 212740 2328 15246 176535.78
1388 Staten Island 2018-12-24 1320 793 17 9 139.73
1389 Staten Island 2018-12-25 1726 1323 17 10 168.16
1390 Staten Island 2018-12-26 892 752 6 7 60.71
1658 Unknown 2018-12-24 26951 15828 126 3027 7842.50
1659 Unknown 2018-12-25 19811 12588 91 1877 6386.68
1660 Unknown 2018-12-26 29588 16748 141 3023 8778.95
In [137]:
# check languardia: 138 and jfk: 132
id_airports = [132, 138]
sliced_christmas[sliced_christmas['PULocationID'].isin(id_airports)]
Out[137]:
Date PULocationID DOLocationID RatecodeID trip_count trip_distance
113 2018-12-24 132 36924 401 3608 56804.71
119 2018-12-24 138 29806 234 2406 22566.94
348 2018-12-25 132 36568 388 3979 64856.73
354 2018-12-25 138 29431 247 3778 34742.76
585 2018-12-26 132 38298 421 6157 100845.43
591 2018-12-26 138 31083 249 7128 65740.35

Chropleth of most impacted area due to blizzard

In [207]:
disaster = ['2018-01-04', '2017-02-09']
set_no_blizzard = set(pu_do_freq.index)-set(pu_do_freq[pu_do_freq['Date'].isin(disaster)].index)
normal_trend = pu_do_freq.loc[set_no_blizzard, :]
blizzard_trend = pu_do_freq[pu_do_freq['Date'].isin(disaster)]
In [208]:
normal_trend = normal_trend.groupby(by=['PULocationID', 'Date']).sum().reset_index()
blizzard_trend = blizzard_trend.groupby(by=['PULocationID', 'Date']).sum().reset_index()
In [209]:
normal_trend = normal_trend.groupby(by=['PULocationID']).mean().reset_index()
blizzard_trend = blizzard_trend.groupby(by=['PULocationID']).mean().reset_index()
In [210]:
normal_trend['trip_count'] = normal_trend['trip_count'].astype('int')
blizzard_trend['trip_count'] = blizzard_trend['trip_count'].astype('int')
In [211]:
id_lst = [i for i in blizzard_trend['PULocationID'] if i in normal_trend['PULocationID']]
In [212]:
normal_trend = normal_trend[normal_trend['PULocationID'].isin(id_lst)]
blizzard_trend = blizzard_trend[blizzard_trend['PULocationID'].isin(id_lst)]
normal_trend.set_index('PULocationID', inplace = True)
blizzard_trend.set_index('PULocationID', inplace = True)
normal_trend['diff'] = blizzard_trend['trip_count']/normal_trend['trip_count']
normal_trend.reset_index(inplace = True)
#normal_trend['diff'] = np.sqrt(normal_trend['diff'])
In [213]:
normal_trend['PULocationID'] = normal_trend['PULocationID'].astype('str')
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
gdf = gpd.GeoDataFrame(pd.merge(normal_trend, sf, left_on='PULocationID', right_on='LocationID')).drop('PULocationID',axis=1)
# folium requires a geo JSON format for the geo_data arg, read more about it from the documentations
geoJSON = gdf[['LocationID','geometry']].drop_duplicates('LocationID').to_json()

m = folium.Map(location=[40.66, -73.94], tiles="cartodbpositron", zoom_start=10)
# refer to the folium documentations on how to plot aggregated data.
folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
    data=normal_trend,
    columns=['PULocationID', 'diff'],
    key_on='feature.properties.LocationID',
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    zoomOut=False,
    legend_name='Daily decrease/increase trip frequency'
).add_to(m)


folium.LayerControl().add_to(m)




folium.Marker(location=[40.7769271,-73.8761546],
              tooltip='<strong>LaGuardia Airport: 1038 trips</strong>',
              icon=folium.Icon(color='red',prefix='fa',icon='plane')).add_to(m)
folium.Marker(location=[40.682334,-73.8826597],
              tooltip='<strong>Cypress Hills: 22 trips</strong>',
              icon=folium.Icon(color='blue',prefix='fa',icon='info')).add_to(m)
folium.Marker(location=[40.760338,-73.9536737],
              tooltip='<strong>Rosevelt Island: 17 trips</strong>',
              icon=folium.Icon(color='blue',prefix='fa',icon='info')).add_to(m)
folium.Marker(location=[40.830321,-73.9254947],
              tooltip='<strong>West Concourse: 41 trips</strong>',
              icon=folium.Icon(color='blue',prefix='fa',icon='info')).add_to(m)



#m.save('plot/blizzard_map.html')             
m
Out[213]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [64]:
normal_trend.sort_values(by='diff', ascending=False)[:20]
Out[64]:
PULocationID DOLocationID RatecodeID trip_count trip_distance diff
52 63 381.591093 5.931174 5 29.864534 4.400000
164 201 182.630769 2.261538 1 19.297692 2.000000
207 253 192.383721 2.465116 1 14.846047 2.000000
151 187 193.800000 2.933333 1 16.834000 2.000000
94 120 283.310734 2.203390 1 10.488136 2.000000
88 108 313.707763 5.566210 4 26.954703 1.750000
165 202 1274.421642 9.070896 10 43.671866 1.700000
4 8 438.858369 3.879828 2 14.067940 1.500000
43 53 343.908676 5.031963 2 26.270411 1.500000
14 20 378.732719 3.884793 2 16.838848 1.500000
96 122 399.389610 6.558442 2 27.852857 1.500000
173 212 711.824219 7.351562 5 28.364336 1.200000
65 78 710.369048 8.591270 5 32.799683 1.200000
203 247 3003.873134 25.052239 39 139.302313 1.051282
23 31 213.540741 2.125926 1 11.376815 1.000000
81 96 180.018692 2.074766 1 9.278879 1.000000
49 60 460.386266 5.017167 3 17.916824 1.000000
48 58 179.258065 1.645161 1 9.508871 1.000000
47 57 176.144000 1.848000 1 9.001680 1.000000
85 102 453.954918 5.975410 3 22.338648 1.000000
In [69]:
# percentage decrease in each borough
normal_trend['PULocationID'] = normal_trend['PULocationID'].astype(int)
normal_trend['PUBorough'] = normal_trend['PULocationID'].apply(lambda x: zone_dic[x])
borough_groupby = normal_trend.groupby(by='PUBorough').median()
1-borough_groupby['diff']
Out[69]:
PUBorough
Bronx            0.400000
Brooklyn         0.500000
EWR              0.000000
Manhattan        0.467771
Queens           0.366356
Staten Island   -0.500000
Unknown          0.475316
Name: diff, dtype: float64
In [129]:
# 91% taxi trips are from manhattan
trip_borough = pu_do_freq.groupby(by='PUBorough').sum()['trip_count']
trip_borough/trip_borough.sum()
Out[129]:
PUBorough
Bronx            0.001023
Brooklyn         0.012634
EWR              0.000003
Manhattan        0.915002
Queens           0.055267
Staten Island    0.000023
Unknown          0.016048
Name: trip_count, dtype: float64
In [70]:
# most increase
zone[zone['LocationID'].isin([63, 202, 247])]
Out[70]:
LocationID Borough Zone service_zone
62 63 Brooklyn Cypress Hills Boro Zone
201 202 Manhattan Roosevelt Island Boro Zone
246 247 Bronx West Concourse Boro Zone
In [135]:
# rosevelt island
print(normal_trend[normal_trend['PULocationID']==202])
print(blizzard_trend.loc[202])
     PULocationID  DOLocationID  RatecodeID  trip_count  trip_distance  diff  \
165           202   1274.421642    9.070896          10      43.671866   1.7   

     PUBorough  
165  Manhattan  
DOLocationID     1079.500
RatecodeID          6.500
trip_count         17.000
trip_distance      67.495
Name: 202, dtype: float64
In [83]:
# laguardia
blizzard_trend.loc[[138]]
Out[83]:
DOLocationID RatecodeID trip_count trip_distance
PULocationID
138 21155.0 168.5 1148 10661.55
In [82]:
# laguardia
normal_trend.loc[normal_trend['PULocationID']==138]
Out[82]:
PULocationID DOLocationID RatecodeID trip_count trip_distance diff PUBorough
112 138 31348.772388 261.544776 6611 64459.452313 0.17365 Queens

Hour and Day

In [87]:
hour_day_freq.head()
Out[87]:
index PULocationID DOLocationID Date day hour trip_count
0 0 1 1 2016-12-04 Sunday 8 1
1 1 1 1 2016-12-04 Sunday 15 1
2 2 1 1 2016-12-06 Tuesday 17 1
3 3 1 1 2016-12-11 Sunday 0 1
4 4 1 1 2016-12-11 Sunday 7 1
In [88]:
hour_day_freq = pd.merge(hour_day_freq, snow, left_on='Date', right_on='Date')
In [89]:
daily = hour_day_freq.groupby(by=['Date', 'day']).sum()
daily_mean = daily.groupby(by=['day']).mean()
daily_mean = daily_mean[['trip_count']]
daily_mean = daily_mean.loc[['Monday', 'Tuesday', "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], :]
In [92]:
# this is a tutorial code on how to use violin plots, taken from https://app.mode.com/modeanalytics/reports/76bb957b935c/details/notebook
sns.set(style="whitegrid")
daily.reset_index(inplace = True)
f, ax = plt.subplots(figsize=(7, 7))

# Show each distribution with both violins and points
sns.violinplot(x="day",y="trip_count",data=daily, 
               inner="box", palette="Set3", cut=2, linewidth=3)

sns.despine(left=True)

f.suptitle('Average daily trip frequency', fontsize=18, fontweight='bold')
ax.set_ylabel("Trip frequency",size = 16,alpha=0.7)
Out[92]:
Text(0, 0.5, 'Trip frequency')
In [93]:
stats_analysis = pd.DataFrame(daily[daily['day']=='Monday'].describe()['trip_count'])
stats_analysis.columns = ['Monday']
stats_analysis['Tuesday'] = daily[daily['day']=='Tuesday'].describe()['trip_count']
stats_analysis['Wednesday'] = daily[daily['day']=="Wednesday"].describe()['trip_count']
stats_analysis['Thursday'] = daily[daily['day']=="Thursday"].describe()['trip_count']
stats_analysis['Friday'] = daily[daily['day']=="Friday"].describe()['trip_count']
stats_analysis['Saturday'] = daily[daily['day']=="Saturday"].describe()['trip_count']
stats_analysis['Sunday'] = daily[daily['day']=="Sunday"].describe()['trip_count']
stats_analysis
Out[93]:
Monday Tuesday Wednesday Thursday Friday Saturday Sunday
count 39.000000 39.000000 38.000000 38.000000 38.000000 39.000000 39.000000
mean 244914.333333 266114.666667 278822.473684 284556.815789 299527.210526 286708.384615 246944.179487
std 43620.747827 45481.220708 40672.355589 50938.129902 43056.764143 48730.518904 46112.149634
min 136866.000000 113671.000000 169799.000000 114150.000000 201203.000000 195327.000000 153998.000000
25% 216848.500000 236914.500000 251259.500000 255996.000000 268667.750000 245535.500000 204070.500000
50% 246976.000000 274434.000000 284753.500000 300868.500000 301387.000000 285893.000000 245974.000000
75% 276575.500000 298776.500000 304787.000000 318454.750000 328784.000000 324287.500000 282595.500000
max 329512.000000 338153.000000 351992.000000 376027.000000 391434.000000 378996.000000 332680.000000
In [96]:
plt.figure(figsize=(10, 6))
days = [('Monday', 'b'), ('Tuesday', 'g'), ("Wednesday", 'r'), ("Thursday", 'c'), 
        ("Friday", 'm'), ("Saturday", 'y'), ("Sunday", 'k')]
for each_day in days:
    day_freq = hour_day_freq[hour_day_freq['day']==each_day[0]]
    day_freq = day_freq.groupby(by=['Date', 'hour']).sum()
    day_freq = day_freq.groupby(by=['hour']).mean()
    day_freq = day_freq[['trip_count']]
    plt.plot(day_freq, label=each_day[0], c=each_day[-1])
plt.title('Average hourly trip frequency in each day\n', fontsize=18)
plt.ylabel('Trip frequency')
plt.legend()
plt.savefig('plot/trip_freq_daily_hourly_trend.png')
plt.show()
In [102]:
hour_day_freq = pd.merge(hour_day_freq, snow, left_on='Date', right_on='Date')
In [103]:
hour_day_freq.head()
Out[103]:
index PULocationID DOLocationID Date day hour trip_count MaxTemp MinTemp Average New Snow Snow Depth
0 0 1 1 2016-12-04 Sunday 8 1 7 -1 3 0.0 0.0
1 1 1 1 2016-12-04 Sunday 15 1 7 -1 3 0.0 0.0
2 25 1 238 2016-12-04 Sunday 18 1 7 -1 3 0.0 0.0
3 133 4 4 2016-12-04 Sunday 0 3 7 -1 3 0.0 0.0
4 134 4 4 2016-12-04 Sunday 1 3 7 -1 3 0.0 0.0
In [104]:
snow_day = hour_day_freq[hour_day_freq['New Snow']>0]
no_snow_day = hour_day_freq[hour_day_freq['New Snow']==0]
snow_day = snow_day.groupby(by=['Date', 'hour']).sum()
snow_day = snow_day.groupby(by=['hour']).mean()
no_snow_day = no_snow_day.groupby(by=['Date', 'hour']).sum()
no_snow_day = no_snow_day.groupby(by=['hour']).mean()
In [105]:
snow_day = snow_day[['trip_count']]
no_snow_day = no_snow_day[['trip_count']]
plt.figure(figsize=(10, 6))
plt.plot(snow_day, label='Snow day', c='b')
plt.plot(no_snow_day, label='No Snow day', c='r')
plt.title('Average Trip frequency hourly trend during snow and no snow day\n', fontsize=18)
plt.ylabel('Trip frequency')
plt.legend()
#plt.savefig('plot/trip_freq_snow_nosnow_trend.png')
plt.show()

Chropleth of non working vs working hour

In [214]:
# split the hour into 2, 8am-7pm, 8pm-7am
non_working_hour = [20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7]
working_hour = list(range(8, 20))
working_hour = hour_day_freq[hour_day_freq['hour'].isin(working_hour)]
non_working_hour = hour_day_freq[hour_day_freq['hour'].isin(non_working_hour)]
working_hour = working_hour.groupby(by=['PULocationID', 'Date']).sum().reset_index()
working_hour = working_hour.groupby(by=['PULocationID']).mean().reset_index()
non_working_hour = non_working_hour.groupby(by=['PULocationID', 'Date']).sum().reset_index()
non_working_hour = non_working_hour.groupby(by=['PULocationID']).mean().reset_index()
In [215]:
working_hour['trip_count'] = working_hour['trip_count'].astype('int')
non_working_hour['trip_count'] = non_working_hour['trip_count'].astype('int')
diff_df = pd.DataFrame([])
diff_df['PULocationID'] = non_working_hour.PULocationID.astype('str')
diff_df['diff'] = np.log(working_hour['trip_count'])-np.log(non_working_hour['trip_count'])
In [216]:
#grouped_do_citi['trip_count'] = np.log(grouped_do_citi['trip_count'])
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
gdf = gpd.GeoDataFrame(pd.merge(diff_df, sf, left_on='PULocationID', right_on='LocationID')).drop('PULocationID',axis=1)
# folium requires a geo JSON format for the geo_data arg, read more about it from the documentations
geoJSON = gdf[['LocationID','geometry']].drop_duplicates('LocationID').to_json()

m = folium.Map(location=[40.66, -73.94], tiles="cartodbpositron", zoom_start=10)
# refer to the folium documentations on how to plot aggregated data.
folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
    data=diff_df,
    columns=['PULocationID', 'diff'],
    key_on='feature.properties.LocationID',
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    zoomOut=False,
    legend_name='Daily decrease/increase trip frequency'
).add_to(m)


folium.LayerControl().add_to(m)



folium.Marker(location=[40.6413111,-73.7803278],
              tooltip='<strong>JFK NY Airport: 1103 trips</strong>',
              icon=folium.Icon(color='orange',prefix='fa',icon='plane')).add_to(m)
folium.Marker(location=[40.7769271,-73.8761546],
              tooltip='<strong>LaGuardia Airport: 1038 trips</strong>',
              icon=folium.Icon(color='red',prefix='fa',icon='plane')).add_to(m)
folium.Marker(location=[40.697316,-73.9200917],
              tooltip='<strong>Bushwick, Greenpoint, Williamsburg</strong>',
              icon=folium.Icon(color='blue',prefix='fa',icon='info')).add_to(m)
folium.Marker(location=[40.721834,-73.9933407],
              tooltip='<strong>Lower East Side</strong>',
              icon=folium.Icon(color='purple',prefix='fa',icon='info')).add_to(m)



#m.save('plot/blizzard_map.html')             
m
Out[216]:
Make this Notebook Trusted to load map: File -> Trust Notebook

compare between holiday, blizzard and normal day

In [142]:
holiday_date = ['2016-12-25', '2017-12-25', '2018-12-25', 
                '2016-12-31', '2017-12-31', '2018-12-31', 
                '2016-02-14', '2017-02-14', '2018-02-14']
disaster = ['2018-01-04', '2017-02-09']
appended_date = ['2016-12-25', '2017-12-25', '2018-12-25', 
                '2016-12-31', '2017-12-31', '2018-12-31', 
                '2016-02-14', '2017-02-14', '2018-02-14', 
                '2018-01-04', '2017-02-09']
set_no_holiday = set(hour_day_freq.index)-set(hour_day_freq[hour_day_freq['Date'].isin(appended_date)].index)
holiday_trend = hour_day_freq[hour_day_freq['Date'].isin(holiday_date)]
normal_trend = hour_day_freq.loc[set_no_holiday, :]
blizzard_trend = hour_day_freq[hour_day_freq['Date'].isin(disaster)]
In [143]:
holiday_trend = holiday_trend.groupby(by=['Date', 'hour']).sum()
holiday_trend = holiday_trend.groupby(by=['hour']).mean()
normal_trend = normal_trend.groupby(by=['Date', 'hour']).sum()
normal_trend = normal_trend.groupby(by=['hour']).mean()
blizzard_trend = blizzard_trend.groupby(by=['Date', 'hour']).sum()
blizzard_trend = blizzard_trend.groupby(by=['hour']).mean()
In [144]:
holiday_trend = holiday_trend[['trip_count']]
normal_trend = normal_trend[['trip_count']]
blizzard_trend = blizzard_trend[['trip_count']]
plt.figure(figsize=(10, 6))
plt.plot(normal_trend, label='Normal', c='b')
plt.plot(holiday_trend, label='Holiday', c='r')
plt.plot(blizzard_trend, label='Blizzard', c='g')
plt.title('Average Trip frequency hourly trend during holiday, normal and blizzard\n', fontsize=18)
plt.ylabel('Trip frequency')
plt.legend()
#plt.savefig('plot/trip_freq_diff_day.png')
plt.show()
In [145]:
stats_df = pd.DataFrame(blizzard_trend.loc[4:8].describe())
stats_df.columns = ['Blizzard_4-9am']
stats_df['Holiday_4-9am'] = holiday_trend.loc[4:8].describe()
stats_df
Out[145]:
Blizzard_4-9am Holiday_4-9am
count 5.000000 5.000000
mean 4668.700000 4069.875000
std 2689.296762 2323.029915
min 1540.000000 1769.000000
25% 2188.500000 2119.750000
50% 5248.500000 3576.375000
75% 7124.000000 5738.500000
max 7242.500000 7145.750000

Trip distance

In [146]:
grouped_date = pu_do_mean.groupby(by='Date').median().reset_index()
grouped_date = pd.merge(grouped_date, snow, left_on='Date', right_on='Date')
In [147]:
fig, ax = plt.subplots(figsize=(10, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.plot(grouped_date.index, grouped_date['trip_distance'],"b-")
ax.axvline(90, color='g', linestyle='--', label='2017-2018')
ax.axvline(180, color='r', linestyle='--', label='2018-2019')
ticks = [i for i in range(0, 270, 30)]
ticks.append(269)
ax.set_xticks(ticks)
ax.set_xticklabels(list(grouped_date.loc[ticks, 'Date']))
ax.set_title('Time series plot on the median trip distance of New York Yellow Taxi\n during December-February from 2016-2019\n', fontsize=18)
ax.set_ylabel('Trip distance (miles)')
ax.set_xlabel('Date')

ax.scatter(x=grouped_date[grouped_date['Snow Depth']>5].index, 
           y=grouped_date[grouped_date['Snow Depth']>5]['trip_distance'], c='r', label='Thick Snow')
ax.scatter(x=grouped_date[grouped_date['New Snow']>5].index, 
           y=grouped_date[grouped_date['New Snow']>5]['trip_distance'], c='g', label='Heavy Snow Fall')
plt.legend()

plt.annotate('Bomb Cyclone\n4.59 miles\n2018-01-04\nSnow fall rate: 9 inch', xy=(124, 4.585), xytext=(135, 4.585), 
             arrowprops=dict(facecolor='green', shrink=0.05))
plt.annotate('NA Blizzard\n5.08 miles\n2017-02-09\nSnow fall rate: 10.3 inch', xy=(70, 5.075556), xytext=(35, 4.9), 
             arrowprops=dict(facecolor='green', shrink=0.05))

#plt.savefig('plot/time_series_dist.png')
plt.show()
In [148]:
grouped_date[grouped_date['New Snow']>5]
Out[148]:
Date index PULocationID DOLocationID trip_distance tip_amount fare_amount extra tolls_amount time_taken speed MaxTemp MinTemp Average New Snow Snow Depth
16 2016-12-17 164991.5 144.0 142.0 5.974500 2.020000 21.666667 0.280762 0.0 0.384991 15.124076 4 -2 1 5.3 2.0
37 2017-01-07 154134.0 144.0 142.0 5.500000 1.821250 19.500000 0.250000 0.0 0.331944 16.240770 -3 -6 -5 5.7 0.5
70 2017-02-09 143830.0 144.0 142.0 5.075556 1.951000 18.000000 0.500000 0.0 0.307222 16.034483 5 -8 -1 10.3 2.0
71 2017-02-10 141107.0 143.0 141.0 5.534286 2.201471 20.835526 0.479167 0.0 0.378889 14.402074 0 -8 -4 5.2 9.0
124 2018-01-04 146249.0 144.0 143.0 4.585000 1.598000 17.000000 0.500000 0.0 0.308611 14.677288 -2 -5 -4 9.0 2.0
In [149]:
# Create a figure instance
fig = plt.figure(1, figsize=(9, 6))

# Create an axes instance
ax = fig.add_subplot(111)

# Create the boxplot
bp = ax.boxplot([grouped_date.loc[0:90, 'trip_distance'],grouped_date.loc[90:180, 'trip_distance'],
             grouped_date.loc[180:, 'trip_distance']])
ax.set_xticklabels(['2016-2017', '2017-2018', '2018-2019'])
plt.ylabel('Trip distance (miles)')
plt.title('Boxplot of average total daily trip distance during\ndifferent year period in December-February\n', fontsize=18)
# Save the figure
#fig.savefig('plot/boxplot_dist.png', bbox_inches='tight')
plt.show()
In [150]:
# central tendency here!!!
date = [0, 90, 180]
for i in date:
    print('Descriptive analysis of each year period')
    print(grouped_date.loc[i:i+90, 'trip_distance'].describe())
    print("\n")
Descriptive analysis of each year period
count    91.000000
mean      5.588215
std       0.200667
min       5.075556
25%       5.446875
50%       5.593333
75%       5.721625
max       6.180000
Name: trip_distance, dtype: float64


Descriptive analysis of each year period
count    91.000000
mean      5.439869
std       0.187102
min       4.585000
25%       5.318750
50%       5.430000
75%       5.523180
max       5.923214
Name: trip_distance, dtype: float64


Descriptive analysis of each year period
count    90.000000
mean      5.785371
std       0.164058
min       5.330000
25%       5.692500
50%       5.779083
75%       5.865703
max       6.170000
Name: trip_distance, dtype: float64


In [151]:
print('Test whether the mean difference of average trip distance\nbetween 2016-2017 and 2018-2019:\n')
print('Normal distribution test for 2016-2017:')
print(stats.shapiro(grouped_date.loc[0:90, 'trip_distance']))
print('\nNormal distribution test for 2018-2019:')
print(stats.shapiro(grouped_date.loc[180:, 'trip_distance'])) #game day may have big p-value, but still stick to normal for simplicity
welch_ttest(grouped_date.loc[0:90, 'trip_distance'], grouped_date.loc[180:, 'trip_distance'])
Test whether the mean difference of average trip distance
between 2016-2017 and 2018-2019:

Normal distribution test for 2016-2017:
(0.991009533405304, 0.7969563007354736)

Normal distribution test for 2018-2019:
(0.9829167127609253, 0.2859242856502533)

 Welch's t-test= -7.2400 
 p-value = 0.00000000 
 Welch-Satterthwaite Degrees of Freedom= 172.8988
In [152]:
# check outliers
index_outliers = []
index_period = [[0, 90], [90]]
for i in range(0, 270, 90):
    sliced_df = grouped_date.loc[i:i+90, :]
    lb, ub = whisk_outliers('trip_distance', sliced_df)
    outliers_ub = sliced_df[sliced_df['trip_distance']>ub]
    if len(list(outliers_ub.index))>0:
        index_outliers.append(list(outliers_ub.index))
    outliers_lb = sliced_df[sliced_df['trip_distance']<lb]
    if len(list(outliers_lb.index))>0:
        index_outliers.append(list(outliers_lb.index))
index_outliers = [j for i in index_outliers for j in i]
grouped_date.loc[index_outliers, :]
Out[152]:
Date index PULocationID DOLocationID trip_distance tip_amount fare_amount extra tolls_amount time_taken speed MaxTemp MinTemp Average New Snow Snow Depth
31 2017-01-01 151909.0 143.0 141.0 6.180000 1.438571 21.250000 0.333333 0.0 0.328657 17.970356 9 0 4 0.0 0.0
105 2017-12-16 153190.5 144.0 142.0 5.886136 2.027600 21.688988 0.250000 0.0 0.387500 14.751750 2 -6 -1 1.0 2.0
121 2018-01-01 144841.0 144.0 141.0 5.923214 1.500000 20.500000 0.307692 0.0 0.319002 17.572090 -8 -16 -12 0.0 1.0
124 2018-01-04 146249.0 144.0 143.0 4.585000 1.598000 17.000000 0.500000 0.0 0.308611 14.677288 -2 -5 -4 9.0 2.0
191 2018-12-12 163045.0 143.0 141.0 6.145273 2.050347 23.065714 0.428571 0.0 0.426782 14.119645 5 -1 1 0.0 0.0
193 2018-12-14 164152.5 143.0 141.0 6.170000 2.160000 23.500000 0.476303 0.0 0.447901 13.633490 8 0 4 0.0 0.0
230 2019-01-20 155744.5 143.0 142.0 5.347237 1.525000 18.800000 0.178571 0.0 0.297384 17.244750 3 -12 -4 0.7 1.0
231 2019-01-21 156418.0 144.0 142.0 5.330000 1.565909 18.833333 0.125000 0.0 0.295278 17.358650 -8 -15 -11 0.5 0.0
In [153]:
grouped_date_freq = pu_do_freq.groupby(by=['Date']).sum().reset_index()
grouped_date_freq['trip_distance_mean'] = grouped_date['trip_distance']
grouped_date_freq = pd.merge(grouped_date_freq, snow, left_on='Date', right_on='Date')
grouped_date_freq['yes_snow'] = grouped_date_freq['Snow Depth'].apply(lambda x: 'Snow' if x else 'No snow')
# remove outliers by inspection
grouped_date_freq = grouped_date_freq[grouped_date_freq['trip_distance']>5]
grouped_date_freq = grouped_date_freq[grouped_date_freq['trip_count']>133251]
In [154]:
# correlation between trip frequency and trip distance during snow
np.corrcoef(grouped_date_freq[grouped_date_freq['yes_snow']=='Snow']['trip_count'], 
            grouped_date_freq[grouped_date_freq['yes_snow']=='Snow']['trip_distance_mean'])
Out[154]:
array([[1.        , 0.38000712],
       [0.38000712, 1.        ]])
In [155]:
# correlation between trip frequency and trip distance during no snow
np.corrcoef(grouped_date_freq[grouped_date_freq['yes_snow']=='No snow']['trip_count'], 
            grouped_date_freq[grouped_date_freq['yes_snow']=='No snow']['trip_distance_mean'])
Out[155]:
array([[1.        , 0.05930834],
       [0.05930834, 1.        ]])
In [156]:
plt.figure(figsize=(12, 8))
grouped_date_freq['trip_count_log'] = np.log(grouped_date_freq['trip_count'])
correlation = np.corrcoef(grouped_date_freq['trip_count_log'], grouped_date_freq['trip_distance_mean'])[0][-1]
ax = sns.lmplot(x="trip_distance_mean", y="trip_count_log",
                data=grouped_date_freq, fit_reg=True, hue='yes_snow', legend=True)

#sns.regplot(x=np.log(grouped_date_freq['trip_count']), y='trip_distance_mean', data=grouped_date_freq, color="b", 
                 #label=f'Correlation: {round(correlation, 2)}')
plt.title('Scatter plot of log transformed trip frequency \nvs average total daily trip distance\n', fontsize=14)
plt.xlabel('Trip distance (miles)')
plt.ylabel('Log transformed trip frequency')
plt.legend()

#plt.savefig('plot/dist_freq_snow.png')
plt.show()
<Figure size 864x576 with 0 Axes>

Choropleth of average trip distance increase/decrease based on pickup location id during blizzard

In [217]:
pu_do_mean = pd.merge(pu_do_mean, snow, left_on='Date', right_on='Date')
In [218]:
snow_trip_sliced = pu_do_mean[pu_do_mean['Snow Depth']>=5]
no_snow_trip_sliced = pu_do_mean[pu_do_mean['Snow Depth']<5]
snow_trip = snow_trip_sliced.groupby(by=['PULocationID']).median().reset_index()
no_snow_trip = no_snow_trip_sliced.groupby(by=['PULocationID']).median().reset_index()
snow_trip = snow_trip[['PULocationID','trip_distance']]
no_snow_trip = no_snow_trip[['PULocationID','trip_distance']]
In [219]:
snow_trip = pd.merge(snow_trip, no_snow_trip, left_on='PULocationID', right_on='PULocationID')
In [220]:
snow_trip['diff'] = snow_trip['trip_distance_x']-snow_trip['trip_distance_y']
In [221]:
#grouped_do_citi['trip_count'] = np.log(grouped_do_citi['trip_count'])
snow_trip['PULocationID'] = snow_trip['PULocationID'].astype('str')
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
gdf = gpd.GeoDataFrame(pd.merge(snow_trip, sf, left_on='PULocationID', right_on='LocationID')).drop('PULocationID',axis=1)
# folium requires a geo JSON format for the geo_data arg, read more about it from the documentations
geoJSON = gdf[['LocationID','geometry']].drop_duplicates('LocationID').to_json()

m = folium.Map(location=[40.66, -73.94], tiles="cartodbpositron", zoom_start=10)
# refer to the folium documentations on how to plot aggregated data.
folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
    data=snow_trip,
    columns=['PULocationID', 'diff'],
    key_on='feature.properties.LocationID',
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    zoomOut=False,
    legend_name='Decrease/Increase average distance travelled'
).add_to(m)


folium.LayerControl().add_to(m)




#m.save('plot/snow_distance_map.html')             
m
Out[221]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [222]:
snow_trip['PUBorough'] = snow_trip['PULocationID'].astype(int).apply(lambda x: zone_dic[x])
In [167]:
snow_trip.groupby(by='PUBorough').mean()
Out[167]:
trip_distance_x trip_distance_y diff
PUBorough
Bronx 3.419375 5.141312 -1.721937
Brooklyn 3.438813 4.578084 -1.139271
EWR 12.500000 13.900000 -1.400000
Manhattan 5.450931 5.474335 -0.023404
Queens 6.234179 7.932297 -1.698119
Staten Island 7.870623 8.822143 -0.951520
Unknown 3.553690 3.205000 0.348690
In [175]:
# check jfk and laguardia
snow_trip[snow_trip['PULocationID'].isin(['132', '138'])]
Out[175]:
PULocationID trip_distance_x trip_distance_y diff PUBorough
123 132 17.680806 17.617687 0.063119 Queens
129 138 10.418285 10.370000 0.048285 Queens
In [176]:
# bronx park
snow_trip_sliced[snow_trip_sliced['PULocationID']==31]
Out[176]:
index PULocationID DOLocationID Date trip_distance tip_amount fare_amount extra tolls_amount time_taken speed MaxTemp MinTemp Average New Snow Snow Depth
649028 16460 31 47 2017-02-14 4.48 0.00 13.5 0.5 0.0 0.111111 40.320000 3 -5 0 0.0 7.0
688101 16469 31 230 2017-02-17 14.14 2.00 40.5 0.5 0.0 0.502500 28.139303 5 -3 1 0.0 5.0
760234 16458 31 3 2017-02-10 3.32 2.66 12.0 0.5 0.0 0.205000 16.195122 0 -8 -4 5.2 9.0
1382651 16763 31 43 2018-01-06 10.52 0.00 31.0 0.5 0.0 0.468889 22.436019 -10 -16 -13 0.0 7.0
1382652 16768 31 166 2018-01-06 9.80 5.95 29.0 0.0 0.0 0.389444 25.164051 -10 -16 -13 0.0 7.0
1401401 16765 31 113 2018-01-10 15.59 0.00 45.5 0.0 0.0 0.761667 20.468271 2 -7 -2 0.0 5.0

Check negotiated fare

In [178]:
#check_mean_distance['trip_count'] = 1
grouped_by_date = pu_do_freq.groupby(by=['Date', 'RatecodeID']).sum().reset_index()[['Date', 'RatecodeID', 'trip_count']]
grouped_by_date = grouped_by_date[grouped_by_date['RatecodeID']!=1]
In [179]:
first_per = grouped_by_date.loc[:456]
first_per['RatecodeID'] = first_per['RatecodeID'].astype('str')
second_per = grouped_by_date.loc[456:911]
second_per['RatecodeID'] = second_per['RatecodeID'].astype('str')
third_per = grouped_by_date.loc[911:]
third_per['RatecodeID'] = third_per['RatecodeID'].astype('str')
/Users/nathanaelyoewono/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/Users/nathanaelyoewono/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
/Users/nathanaelyoewono/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
In [180]:
plt.figure(figsize=(8, 6))
sns.distplot(np.log(first_per[first_per['RatecodeID']=='5']['trip_count']), label='2016-2017')
sns.distplot(np.log(second_per[second_per['RatecodeID']=='5']['trip_count']), label='2017-2018')
sns.distplot(np.log(third_per[third_per['RatecodeID']=='5']['trip_count']), label='2018-2019')
plt.title('Distribution plot of negotiated fare trip frequency\n', fontsize=18)
plt.legend()
#plt.savefig('plot/dist_nego.png')
plt.show()
In [183]:
first_per = first_per.groupby(by='RatecodeID').sum()
second_per = second_per.groupby(by='RatecodeID').sum()
third_per = third_per.groupby(by='RatecodeID').sum()
In [184]:
plt.figure(figsize=(8, 6))
plt.plot(first_per, c='r', label='2016-2017')
plt.plot(second_per, c='b', label='2017-2018')
plt.plot(third_per, c='g', label='2018-2019')
plt.legend()
plt.title('Rate code trip increase/decrease in each period\n', fontsize=18)
plt.ylabel('Trip frequency')
plt.xlabel('Rate code id')
#plt.savefig('plot/rate_trend.png')
plt.show()
In [188]:
# rate code id 5 is the negoiated fare
third_per/first_per
Out[188]:
trip_count
RatecodeID
2 0.835063
3 0.726871
4 1.085061
5 4.574381
6 0.800000
In [193]:
check_distance = rate_id.groupby(by=['RatecodeID']).median()
check_distance
Out[193]:
index PULocationID hour trip_distance DOLocationID tip_amount fare_amount extra tolls_amount time_taken speed
RatecodeID
1 41666.0 143.0 12.0 2.635000 150.670959 1.380748 11.378151 0.495851 0.000 0.204054 12.771784
2 95221.0 158.0 13.0 17.759412 132.000000 8.770000 52.000000 0.000000 5.540 0.636709 27.350883
3 112308.0 162.0 11.0 17.420000 1.000000 12.000000 66.500000 0.000000 13.875 0.570694 30.618947
4 117048.5 138.0 14.0 17.457333 265.000000 5.830000 60.400000 0.500000 0.000 0.527778 32.144954
5 129192.0 141.0 12.0 10.320000 164.000000 0.000000 39.530000 0.000000 0.000 0.498333 20.837111
6 121640.0 114.0 9.0 0.190000 88.000000 0.000000 5.500000 0.000000 0.000 0.075833 7.275701
In [223]:
pu_rate_5 = pu_do_freq[pu_do_freq['RatecodeID']==5]
pu_rate_5 = pu_rate_5[pu_rate_5['Date']>='2018-12-01']
pu_rate_5 = pu_rate_5.groupby(by=['PULocationID']).sum().reset_index()
pu_rate_5 = pu_rate_5[['PULocationID', 'trip_count']]
pu_rate_5 = pu_rate_5.sort_values(by='trip_count', ascending = False)
pu_rate_5[:10]
Out[223]:
PULocationID trip_count
45 48 1956
124 132 1772
162 170 1728
132 140 1675
220 231 1600
129 137 1587
65 68 1552
219 230 1552
153 161 1343
39 42 1324
In [195]:
zone[zone['LocationID'].isin(pu_rate_5[:10]['PULocationID'])]
Out[195]:
LocationID Borough Zone service_zone
41 42 Manhattan Central Harlem North Boro Zone
47 48 Manhattan Clinton East Yellow Zone
67 68 Manhattan East Chelsea Yellow Zone
131 132 Queens JFK Airport Airports
136 137 Manhattan Kips Bay Yellow Zone
139 140 Manhattan Lenox Hill East Yellow Zone
160 161 Manhattan Midtown Center Yellow Zone
169 170 Manhattan Murray Hill Yellow Zone
229 230 Manhattan Times Sq/Theatre District Yellow Zone
230 231 Manhattan TriBeCa/Civic Center Yellow Zone

Choropleth showing the trip frequency of rate code id 5

In [224]:
pu_rate_5['PULocationID'] = pu_rate_5['PULocationID'].astype('str')
#grouped_do_citi['trip_count'] = np.log(grouped_do_citi['trip_count'])
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
gdf = gpd.GeoDataFrame(pd.merge(pu_rate_5, sf, left_on='PULocationID', right_on='LocationID')).drop('PULocationID',axis=1)
# folium requires a geo JSON format for the geo_data arg, read more about it from the documentations
geoJSON = gdf[['LocationID','geometry']].drop_duplicates('LocationID').to_json()

m = folium.Map(location=[40.66, -73.94], tiles="cartodbpositron", zoom_start=10)
# refer to the folium documentations on how to plot aggregated data.
folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
    data=pu_rate_5,
    columns=['PULocationID', 'trip_count'],
    key_on='feature.properties.LocationID',
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    zoomOut=False,
    legend_name='Trip frequency'
).add_to(m)


folium.LayerControl().add_to(m)



folium.Marker(location=[40.6413111,-73.7803278],
              tooltip='<strong>JFK NY Airport</strong>',
              icon=folium.Icon(color='purple',prefix='fa',icon='plane')).add_to(m)



#m.save('plot/rate_code_five.html')             
m
Out[224]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]: